#!/bin/bash

# Copyright (c) Los Alamos National Security, LLC, and others.

# This script extracts a subset of the Wikimedia access logs. You probably
# don't need to run it, since the data are included with QUAC, and you don't
# want to run it, since it takes several hours (to decompress and grep through
# 140GB of data).
#
# Note that files which appear to be already present are not re-copied.
#
# It takes two arguments:
#
# 1. The directory containing the raw Wikimedia data you want to pull from.
# 2. A toggle; if present, the copy step (which is the long one) is not done.

# Copy the "sandy" subset of one hourly pagecounts file into raw/.
#
# Arguments:
#   $1  path to a gzipped pagecounts file laid out as .../YYYY-MM/FILE
#
# Outputs:
#   Prints YEAR/YYYY-MM/FILE on stdout. If raw/YEAR/YYYY-MM/FILE does not
#   exist yet, it is created holding the re-gzipped lines of $1 that contain
#   "sandy" (fixed string, case-insensitive). An existing file is left
#   untouched. The destination directory must already exist.
extract () {
    # Keep these local: the main loop also uses a variable named "file".
    local file month year dest
    file=$(basename -- "$1")
    month=$(basename -- "$(dirname -- "$1")")
    year=${month%???}  # drop the trailing "-MM" to get YYYY
    dest=raw/$year/$month/$file
    printf '%s\n' "$year/$month/$file"
    if [[ ! -e $dest ]]; then
        zcat -- "$1" | grep -F -i sandy | gzip -n9 > "$dest"
    fi
}

# Everything that follows reads from the raw data directory in $1, so refuse
# to run without it instead of globbing against "/2012/...".
if [[ -z "${1:-}" ]]; then
    echo "usage: $0 RAW_DATA_DIR [SKIP_COPY]" >&2
    exit 1
fi

mkdir -p raw/2012/2012-{09,10,11}

if [[ -z "${2:-}" ]]; then  # give a 2nd argument to skip the lengthy copy

    # Some normal, working data. We exclude a few files to create gaps in the
    # time series we are going to test against later.
    #
    # Also, we want to make sure that November is almost, but not quite,
    # closed (to test time series pruning); to do so, we exclude the very last
    # hour in the month. However, this is stamped 2012-12-01 00:00:00, so it's
    # already implicitly excluded by the patterns below.
    for file in "$1"/2012/2012-09/pagecounts-201209{28,29,30}-*.gz \
                "$1"/2012/2012-{10,11}/pagecounts-*.gz; do
        # A glob that matches nothing is left as the literal pattern; don't
        # hand that to extract, which would create a bogus output file.
        if [[ ! -e $file ]]; then
            echo "no such file: $file" >&2
            continue
        fi
        case $file in
            # Deliberate gaps: a 1-day gap (20121027), the 1-day gap
            # off-by-one (20121028 00:00), and a 1-hour gap (20121030 15:00).
            *20121027* | *20121028-000000* | *20121030-150000*)
                echo "skipping $file"
                ;;
            *)
                extract "$file"
                ;;
        esac
    done

fi

# Pathological and unusual cases. We want to test that we deal with them
# correctly. Important: None of these files should contain any valid data! (We
# want to keep the test data duration nice and compact.) All of these problems
# were downloaded correctly from WMF.
mkdir -p raw/2008/2008-10
mkdir -p raw/2011/2011-01
mkdir -p raw/2011/2011-10
mkdir -p raw/2015/2015-01
mkdir -p raw/2099/2099-01  # synthetic problems go here

# Ends in the middle of a line (real). Only the final (partial) line is kept.
zcat -- "$1/2008/2008-10/pagecounts-20081021-090000.gz" \
    | tail -n 1 \
    | gzip -n9 > raw/2008/2008-10/pagecounts-20081021-090000.gz

# Not a gzip file (real, but truncated)
head -n 10 -- "$1/2010/2010-07/pagecounts-20100707-112221.tmp" \
    > raw/2099/2099-01/pagecounts-20990101-030000.tmp

# Out of order (real excerpt). The delimiter is quoted so the lines are taken
# literally, with no shell expansion.
gzip -n9 > raw/2011/2011-01/pagecounts-20110116-100000.gz <<'EOF'
en L%C3%A1szl%C3%B3_Szab%C3%B3_(footballer_born_1989) 1 33842
en L%C3%A1szl%C3%B3_Szoll%C3%A1s 3 15824
en L%C3%A1szl%C3%B3_Vask%C3%BAti 1 32309
en L%C3%A1szl%C3%B3_Vincze 1 7640
en L%C3%A1trabjarg 1 377
en Kyriakos 1 9920
en Kyriakos_Kyritsis 1 10331
en Kyriakos_Papadopoulos 4 86219
en Kyriakos_Pittakis 1 7917
en Kyriakos_Tamvakis 1 8358
EOF

# Named .gz but is not gzipped (real)
cp -- "$1/2011/2011-10/pagecounts-20111008-180001.gz" raw/2011/2011-10

# Whole file sorted together (dot and non-dot in proper order together)
extract "$1/2015/2015-01/pagecounts-20150103-010000.gz"

# Contains 100 unparseable lines (synthetic) that pass egrep
for _ in {1..100}; do
    printf 'a a a a\n'
done | gzip -n9 > raw/2099/2099-01/pagecounts-20990101-010000.gz

# Ends in a way that breaks the gzip decompression. The problem was identified
# in pagecounts-20090921-160000.gz. We reproduce it by keeping only the first
# 24 bytes of a valid gzip stream.
gzip -n9 <<'EOF' | head -c 24 > raw/2099/2099-01/pagecounts-20990101-020000.gz
a b 1 1
c d 1 1
e f 1 1
g h 1 1
EOF
